Introduction

Where do you think autumn-spawning Atlantic herring spawns? In order to explore the spawning distribution of Atlantic herring, species distribution models were created for larvae of herring.

1. Collect data and build model

1.1 Collect & organize occurrence data

We collect data of Atlantic herring larvae occurrences from 2000 - 2020 from the International Herring Larvae Surveys (IHLS). The occurrence dataset is read from a parquet file stored in the data lake.

# the file to process
acf <- S3FileSystem$create(
  anonymous = T,
  scheme = "https",
  endpoint_override = "s3.waw3-1.cloudferro.com"
)

eurobis <- arrow::open_dataset(acf$path("emodnet/biology/eurobis_occurence_data/eurobisgeoparquet/eurobis_no_partition_sorted.parquet" ))
df_occs <- eurobis |> 
  filter(aphiaidaccepted==126417, datasetid==4423,
         longitude > -12, longitude < 10,
         latitude > 48, latitude < 62,
         observationdate >= as.POSIXct("2000-01-01"),
         observationdate <= as.POSIXct("2020-12-31")) |> 
  collect() 

glimpse(df_occs)
## Rows: 7,902
## Columns: 10
## $ occurrenceid            <int> 18245864, 18245865, 18245866, 18245869, 182458…
## $ datasetid               <int> 4423, 4423, 4423, 4423, 4423, 4423, 4423, 4423…
## $ observationdate         <dttm> 2015-09-26, 2015-09-26, 2015-09-26, 2015-09-2…
## $ longitude               <dbl> -3.833333, -3.833333, -3.833333, -3.833333, -3…
## $ latitude                <dbl> 58.58333, 58.75000, 58.91667, 59.08333, 59.416…
## $ scientificname          <chr> "Clupea harengus", "Clupea harengus", "Clupea …
## $ aphiaid                 <int> 126417, 126417, 126417, 126417, 126417, 126417…
## $ scientificname_accepted <chr> "Clupea harengus", "Clupea harengus", "Clupea …
## $ aphiaidaccepted         <int> 126417, 126417, 126417, 126417, 126417, 126417…
## $ taxonrank               <int> 220, 220, 220, 220, 220, 220, 220, 220, 220, 2…
df_occs <- df_occs %>% 
  select(Latitude=latitude,
         Longitude=longitude, 
         Time=observationdate) %>%
  mutate(year = year(Time),
         month = month(Time))

mapview(df_occs %>% dplyr::select(Longitude) %>% pull,
        df_occs %>% dplyr::select(Latitude) %>% pull,
        crs = "epsg:4326")
table(df_occs$month)
## 
##    1    9   10   12 
## 2293 4480   99 1030

1.2 Remove duplicates

We start with 7902 occurrences.

df_occs <- df_occs %>%
  distinct(year, month, Longitude, Latitude, .keep_all = TRUE)

After removal of duplicates, 5347 occurrences are left.

1.3 Reduce sampling bias

# Thin towards distance of 10 NM or 18.52 km
# This is the distance between valid hauls in ICES trawl surveys

df_occs_thinned <- df_occs[0,] %>% select(-Time)
for (y in 1:length(2000:2020)) {
  for (m in 1:length(1:12)) {
    tmp_df <- df_occs %>% filter(year == c(2000:2020)[y],
                                 month == m)
    tmp_df_thinned <- spThin::thin(tmp_df %>% mutate(species = "Atlantic herring"),
                                   lat.col = "Latitude", long.col = "Longitude",
                                   spec.col = "species", thin.par = 18.52, reps = 1,
                                   write.files = FALSE, locs.thinned.list.return = TRUE,
                                   verbose = FALSE)[[1]]
    
    tmp_df_thinned <- tmp_df_thinned %>%
      mutate(year = c(2000:2020)[y],
             month = m)
    
    df_occs_thinned <- rbind(df_occs_thinned, tmp_df_thinned)
  }
  print(paste0(c(2000:2020)[y], " done"))
}

save(df_occs_thinned, file = "data/df_thinned.Rdata")
load("data/df_thinned.Rdata")

nrow(df_occs_thinned)
## [1] 3995
# 3995

mapview(df_occs_thinned %>% dplyr::select(Longitude) %>% pull,
        df_occs_thinned %>% dplyr::select(Latitude) %>% pull,
        crs = "epsg:4326")

1.4 Create background points

# lets take a spatial buffer of 100 km around the occurrence points for this example

abs <- spatiotemp_pseudoabs(spatial.method = "buffer", temporal.method = "random",
                            occ.data = df_occs_thinned %>% mutate(x = Longitude, y = Latitude),
                            temporal.ext = c("2000-01-01", "2020-12-31"), spatial.buffer = 100000,
                            n.pseudoabs = 10000)

glimpse(abs)

#limit temporal values of background points to months where occurrences of larvae are present
set.seed(123)
abs <- abs %>%
  mutate(month = sample(unique(df_occs_thinned$month), nrow(abs), replace = TRUE)) %>%
  mutate(Longitude = x, Latitude = y) %>%
  select(-day, -x, -y)
glimpse(abs)

save(abs, file = "zarr_extraction/absences_save.Rdata")

1.5 Sample environmental variables at occurrences and background points

load("zarr_extraction/SAVE/absences_save.Rdata")

#combine occurrences and background points
df_occ_bg <- rbind(df_occs_thinned %>% mutate(presence = 1), 
                   abs %>% mutate(presence = 0))
glimpse(df_occ_bg)


source("zarr_extraction/editoTools.R")
options("outputdebug"=c('L','M'))
load(file = "zarr_extraction/editostacv2.par")

#the requested timestep resolution of the dataset in milliseconds
#in this case we work with monthly data (1 month = 30.436875*24*3600*1000 = 2629746000 milliseconds)
timeSteps=c(2629746000)

##TODO: ADD ELEVATION, WINDFARMS AND SEABED SUBSTRATE ENERGY ------
parameters = list("thetao" = c("par" = "thetao", "fun" = "mean", "buffer" = "18520"),
                  "so" = c("par" = "so", "fun" = "mean", "buffer" = "18520"),
                  "zooc" = c("par" = "zooc", "fun" = "mean", "buffer" = "18520"),
                  "phyc" = c("par" = "phyc", "fun" = "mean", "buffer" = "18520"))

#check if they are all available in the data lake
for (parameter in parameters) {
  param = ifelse(length(parameter) == 1, parameter, parameter["par"])
  if(! param %in% unique(EDITOSTAC$par)) dbl("Unknown parameter ", param)
}

#extract function (requires POSIXct Time column)
df_occ_bg_env = enhanceDF(inputPoints = df_occ_bg %>% 
                            mutate(Time = as.POSIXct(paste(year,month,1,sep = "-"))),
                          requestedParameters = parameters,
                          requestedTimeSteps = timeSteps,
                          stacCatalogue = EDITOSTAC,
                          verbose="on")

glimpse(df_occ_bg_env)

df_occ_bg_env <- df_occ_bg_env %>% select(Longitude, Latitude, year, month,
                                          thetao, so, zooc, phyc)

# remove observations where no environmental values were present
df_occ_bg_env <- drop_na(df_occ_bg_env)
table(df_occ_bg_env$presence)
#    0    1 
# 7617 3910 


## TODO remove this part ----
substr_lvl <- tibble(sub_char = c("Fine mud", "Sand", "Muddy sand", "Mixed sediment",
                                  "Coarse substrate","Sandy mud or Muddy sand", "Seabed",
                                  "Rock or other hard substrata","Sandy mud", "Sandy mud or Muddy sand ",
                                  "Sediment","Fine mud or Sandy mud or Muddy sand"),
                     seabed_substrate = c(1:12))
energy_lvl <- tibble(ene_char = c("High energy", "Moderate energy", "Low energy", "No energy information"),
                     seabed_energy = c(1:4))

df_occ_bg_env <- df_occ_bg_env %>%
  left_join(energy_lvl, by = "seabed_energy") %>%
  left_join(substr_lvl, by = "seabed_substrate") %>%
  dplyr::select(-seabed_energy, -seabed_substrate) %>%
  mutate(windfarms = as.factor(windfarms))

save(df_occ_bg_env, file = "zarr_extraction/SAVE/pres_abs_env_save.Rdata")

1.6 Model creation using ENMeval

load(file = "zarr_extraction/SAVE/pres_abs_env_save.Rdata")

glimpse(df_occ_bg_env)

# input for ENMevaluate requires lon & lat as first covariates

#test different model settings using ENMeval
model_fit <- ENMeval::ENMevaluate(occs = df_occ_bg_env %>%                                    
                                    filter(presence == 1) %>% 
                                    dplyr::select(Longitude, Latitude, depth, Phyto, SSS, SST, windfarms, ZooPl, ene_char, sub_char),
                                  bg = df_occ_bg_env %>% 
                                    filter(presence == 0) %>% 
                                    dplyr::select(Longitude, Latitude, depth, Phyto, SSS, SST, windfarms, ZooPl, ene_char, sub_char),
                                  tune.args = list(fc = c("L","LQ","LQH"),
                                                   rm = c(1,2,4,8,32)),
                                  algorithm = "maxnet",
                                  partitions = "randomkfold",
                                  categoricals = c("sub_char", "ene_char", "windfarms"),
                                  doClamp = TRUE,
                                  parallel = TRUE)


save(model_fit, file = "SAVE/model_fit_save.Rdata")

Show results from ENMevaluate

load("zarr_extraction/SAVE/model_fit_save.Rdata")

print(model_fit %>% eval.results() %>% filter(delta.AICc == 0))
##    fc rm   tune.args auc.train cbi.train auc.diff.avg auc.diff.sd auc.val.avg
## 1 LQH  1 fc.LQH_rm.1 0.8090342        NA  0.003626047    0.003206   0.8084254
##    auc.val.sd cbi.val.avg cbi.val.sd or.10p.avg  or.10p.sd   or.mtp.avg
## 1 0.003942938          NA         NA  0.1025575 0.01737908 0.0002557545
##      or.mtp.sd     AICc delta.AICc w.AIC ncoef
## 1 0.0005718844 70485.62          0     1    32

2. Analyse model

2.1 Evaluate model

The default output only shows the AUC. We also want to calculated the TSS, TPR and TNR.

load(file = "zarr_extraction/SAVE/pres_abs_env_save.Rdata")

AUC_maxent <- list()
TSS_maxent <- list()

Presences <- df_occ_bg_env %>%                                    
  filter(presence == 1) %>% 
  dplyr::select(Longitude, Latitude, depth, Phyto, SSS, SST, windfarms, ZooPl, ene_char, sub_char)
Background <- df_occ_bg_env %>%                                    
  filter(presence == 0) %>% 
  dplyr::select(Longitude, Latitude, depth, Phyto, SSS, SST, windfarms, ZooPl, ene_char, sub_char)

tmp_AUC <- list()
tmp_TSS <- list()
for (i in 1:10) {
  #Generate "k" groups:
  k<-4 #division of the data will be 75% Vs 25%
  
  groups_pres<-kfold(Presences,k) #Kfold divide the data, assigning every row to one of the K groups randomly.
  groups_abs<-kfold(Background,k)
  
  #Four groups will be used to generate the model and the rest of the point (one group) will be used to evaluate it:
  EvalBg<-Background[groups_pres==1,]
  TrainBg<-Background[groups_pres!=1,]
  
  EvalPres<-Presences[groups_pres==1,]
  TrainPres<-Presences[groups_pres!=1,]
  
  #get model settings
  eval_res <- model_fit
  opt.aicc <- eval.results(model_fit) %>% filter(delta.AICc == 0)
  mod <- eval_res@models[[which(names(eval_res@models) == opt.aicc$tune.args)]]
  
  EvalBgRes  <- predict(mod, EvalBg, type = "cloglog")
  EvalPresRes  <- predict(mod, EvalPres, type = "cloglog")
  
  tmp_AUC[[i]]<-evaluate(c(EvalPresRes), c(EvalBgRes))
  umbral_maxent<-threshold( tmp_AUC[[i]])
  tmp_TSS[[i]]<-evaluate(p=c(EvalPresRes), a=c(EvalBgRes), tr=umbral_maxent$spec_sens)
}

AUC_maxent <- tmp_AUC
TSS_maxent <- tmp_TSS

AUC_maxent_vect  <- sapply(AUC_maxent,function(x){slot(x,'auc')})
TPR_maxent_vect  <- sapply(TSS_maxent,function(x){slot(x,'TPR')})
TNR_maxent_vect  <- sapply(TSS_maxent,function(x){slot(x,'TNR')})
TSS_maxent_vect  <- TPR_maxent_vect + TNR_maxent_vect - 1

stat_df <- data.frame(AUC = c(mean(AUC_maxent_vect), sd(AUC_maxent_vect)),
                      TSS = c(mean(TSS_maxent_vect), sd(TSS_maxent_vect)),
                      TPR = c(mean(TPR_maxent_vect), sd(TPR_maxent_vect)),
                      TNR = c(mean(TNR_maxent_vect), sd(TNR_maxent_vect)))

stat_df <- data.frame(statistic = c("AUC", "TSS", "TPR", "TNR"),
                      mean = c(mean(AUC_maxent_vect), mean(TSS_maxent_vect), mean(TPR_maxent_vect), mean(TNR_maxent_vect)),
                      sd = c(sd(TSS_maxent_vect), sd(TSS_maxent_vect), sd(TPR_maxent_vect), sd(TNR_maxent_vect)))
  
stat_df

2.2 Response curves

vs <- c("depth", "SST", "SSS", "Phyto", "ZooPl", "windfarms", "sub_char", "ene_char")

name_key <- data.frame(old = c("depth", "SST", "SSS", "Phyto", "ZooPl", "windfarms", "sub_char", "ene_char"),
                       new = c("Depth (m)%", "Sea surface temperature (°C)%", "Sea surface salinity (PSU)%", 
                               "Phytoplankton concentration%(mmol C / m³)", "Zooplankton concentration%(g C / m²)", 
                               "Windfarm presence", "Seabed substrate", "Seabed energy"))
addline_format <- function(x,...){
  gsub('%','\n',x)
}

for (i in 1:nrow(name_key)) {

  v <- name_key$old[i]
  out_name <- name_key$new[i]
  
  dat <- response.plot(mod, v, type = "cloglog", 
                ylab = "Probability of occurrence",
                plot = F)
  
  if(is.character(dat[1,1])) {
    assign(v, ggplot(dat) +
      geom_bar(aes_string(x = v, y = "pred"), stat='identity', fill = "#332288") +
      # scale_x_discrete(limits = rev(substr_key[which(substr_key %in% dat_lv$sub_char)])) +
      scale_y_continuous(limits = c(0,1)) +
      coord_flip() +
      labs(title = out_name, x = "", y = "Probability of presence") +
      theme_bw() +
      theme(legend.title = element_blank(),
            plot.title = element_text(size=10, face = "bold", colour = "black", hjust = 0.5),
            axis.text.y = element_text(size=10, face = "plain", colour = "black"),
            axis.text.x = element_text(size=10, face = "plain", colour = "black"),
            axis.title.x = element_text(size=10, face = "bold", colour = "black"),
            axis.title.y = element_text(size=10, face = "bold", colour = "black"),
            legend.text = element_text(size=10, face = "bold", colour = "black")))
  } else {
    
    #define plot bounds (restricted to where occurrences are present)
    min <- df_occ_bg_env %>% filter(presence == 1) %>% dplyr::select(all_of(v)) %>% min
    max <- df_occ_bg_env %>% filter(presence == 1) %>% dplyr::select(all_of(v)) %>% max
    
    assign(v, ggplot(dat) +
      geom_line(aes_string(x = v, y = "pred"), linewidth = 1, color = "#332288") + 
      labs(x = addline_format(out_name), y = "Probability of presence") +
      scale_x_continuous(limits = c(min, max)) +
      scale_y_continuous(limits = c(0,1)) +
      theme_bw() +
      theme(legend.title = element_text(size=10, face = "bold", colour = "black"),
            legend.text = element_text(size=10, face = "plain", colour = "black"), 
            axis.text.y = element_text(size=10, face = "plain", colour = "black"),
            axis.text.x = element_text(size=10, face = "plain", colour = "black"),
            axis.title.x = element_text(size=10, face = "bold", colour = "black"),
            axis.title.y = element_text(size=10, face = "bold", colour = "black")))
  }
}

grid.arrange(depth, SST, SSS, Phyto, ZooPl, windfarms, ene_char, sub_char)

3. Project model

3.1 Get raster slices to project on

using the getRasterSlice function

variables <- c("depth", "SST", "SSS", "Phyto",
               "ZooPl", "windfarms", "seabed_substrate", 
               "seabed_energy")

files <- tibble(name = list.files("data/raster_slices/", pattern = ".tif|.grd", full.names = TRUE),
                parameter = str_extract(name, pattern = paste0(variables, collapse = "|")),
                month = str_extract(name, pattern = "\\d{2}|\\d"))

#get model
eval_res <- model_fit
opt.aicc <- eval.results(model_fit) %>% filter(delta.AICc == 0)
mod <- eval_res@models[[which(names(eval_res@models) == opt.aicc$tune.args)]]

3.2 Spatial predictions

##TODO: delete this part
substr_lvl <- tibble(sub_char = c("Fine mud", "Sand", "Muddy sand", "Mixed sediment",
                                  "Coarse substrate","Sandy mud or Muddy sand", "Seabed",
                                  "Rock or other hard substrata","Sandy mud", "Sandy mud or Muddy sand ",
                                  "Sediment","Fine mud or Sandy mud or Muddy sand"),
                     seabed_substrate = c(1:12))
energy_lvl <- tibble(ene_char = c("High energy", "Moderate energy", "Low energy", "No energy information"),
                     seabed_energy = c(1:4))


predictions <- stack()
for (m in unique(df_occ_bg_env$month)) {
  
  plot_st <- stack(files %>% 
                     filter(month == m | is.na(month)) %>% 
                     dplyr::select(name) %>% 
                     pull())
  #get model
  
  names(plot_st) <- str_extract(names(plot_st), pattern = paste0(variables, collapse = "|"))
  names(plot_st)[which(names(plot_st) == "seabed_energy")] <- "ene_char"
  names(plot_st)[which(names(plot_st) == "seabed_substrate")] <- "sub_char"
  
  pred_m <- predict(plot_st, mod, clamp=T, type="cloglog",
                         factors = list(ene_char = factor(energy_lvl$seabed_energy,
                                                          labels = energy_lvl$ene_char,
                                                          levels = energy_lvl$seabed_energy),
                                        sub_char = factor(substr_lvl$seabed_substrate,
                                                          labels = substr_lvl$sub_char,
                                                          levels = substr_lvl$seabed_substrate)))
  # crop to extent of study area
  pred_m <- crop(pred_m, extent(-12, 10, 48, 62))
  
  names(pred_m) <- paste0("prediction_", month.name[m])

  predictions <- stack(predictions, pred_m)
}

plot(predictions)